# Se computa la matriz H y los grados de libertad
ld = lambdas_to_try[lambda] * diag(ncol(X_scaled))
H = X_scaled %*% solve(t(X_scaled) %*% X_scaled + ld) %*% t(X_scaled)
df = tr(H)
# Se computan los criterios de informacion
aic[lambda] = nrow(X_scaled) * log(t(resid) %*% resid) + 2 * df
bic[lambda] = nrow(X_scaled) * log(t(resid) %*% resid) + 2 * df * log(nrow(X_scaled))
}
# Grafico de los criterios de informacion vs cada lambda estimado
plot(log(lambdas_to_try), aic, col = "orange", type = "l",
ylim = c(190, 260), ylab = "Information Criterion")
lines(log(lambdas_to_try), bic, col = "skyblue3")
legend("bottomright", lwd = 1, col = c("orange", "skyblue3"), legend = c("AIC", "BIC"))
# Grafico de los criterios de informacion vs cada lambda estimado
plot(log(lambdas_to_try), aic, col = "orange", type = "l",
ylab = "Information Criterion")
lines(log(lambdas_to_try), bic, col = "skyblue3")
legend("bottomright", lwd = 1, col = c("orange", "skyblue3"), legend = c("AIC", "BIC"))
legend("bottomleft", lwd = 1, col = c("orange", "skyblue3"), legend = c("AIC", "BIC"))
# Grafico de los criterios de informacion vs cada lambda estimado
plot(log(lambdas_to_try), aic, col = "orange", type = "l",
ylab = "Information Criterion")
lines(log(lambdas_to_try), bic, col = "skyblue3")
legend("bottomleft", lwd = 1, col = c("orange", "skyblue3"), legend = c("AIC", "BIC"))
# Optimal lambdas according to both criteria
lambda_aic = lambdas_to_try[which.min(aic)]
lambda_bic = lambdas_to_try[which.min(bic)]
# Fit final models, get their sum of squared residuals and multiple R-squared
model_aic = glmnet(X, y, alpha = 0, lambda = lambda_aic, standardize = TRUE)
y_hat_aic = predict(model_aic, X)
ssr_aic = t(y - y_hat_aic) %*% (y - y_hat_aic)
rsq_ridge_aic = cor(y, y_hat_aic)^2
model_bic = glmnet(X, y, alpha = 0, lambda = lambda_bic, standardize = TRUE)
y_hat_bic = predict(model_bic, X)
ssr_bic = t(y - y_hat_bic) %*% (y - y_hat_bic)
rsq_ridge_bic = cor(y, y_hat_bic)^2
# See how increasing lambda shrinks the coefficients --------------------------
# Each line shows coefficients for one variables, for different lambdas.
# The higher the lambda, the more the coefficients are shrinked towards zero.
res <- glmnet(X, y, alpha = 0, lambda = lambdas_to_try, standardize = FALSE)
plot(res, xvar = "lambda")
legend("bottomright", lwd = 1, col = 1:6, legend = colnames(X), cex = .7)
#- Como se ve en las diapositivas, en LASSO se tiene una forma funcional para la
#- penalizacion. En lo demás, es muy similar a Ridge.
#--> Se genera una secuencia de valores de \lambda para evaluar el que minimiza RMSE
lambdas_to_try = 10^seq(-2, 5, length.out = 100)
#--> Clave: alpha=1 implementa LASSO en este procedimiento
lasso_cv = cv.glmnet(X, y, alpha = 1, lambda = lambdas_to_try,
standardize = TRUE, nfolds = 10)
#--> Graficar los resultados de la validacion cruzada
plot(lasso_cv)
# \lambda que minimiza MSE
lambda_cv = lasso_cv$lambda.min
# Se ajusta modelo final, calcula RSS y R cuadrado
model_cv = glmnet(X, y, alpha = 1, lambda = lambda_cv, standardize = TRUE)
y_hat_cv = predict(model_cv, X)
ssr_cv = t(y - y_hat_cv) %*% (y - y_hat_cv)
rsq_lasso_cv = cor(y, y_hat_cv)^2
#--> Notese como incrementar el lambda genera coeficientes cercanos a cero
#--> Cada linea muestra coeficientes para cada variable, para diferentes \lambdas.
#--> Cuanto mayor el lambda, mas coeficientes se vuelven cero.
res = glmnet(X, y, alpha = 1, lambda = lambdas_to_try, standardize = FALSE)
plot(res, xvar = "lambda")
legend("bottomright", lwd = 1, col = 1:6, legend = colnames(X), cex = .7)
#-> Ridge vs LASSO segun R cuadrado
rsq = cbind("R-squared" = c(rsq_ridge_cv, rsq_ridge_aic, rsq_ridge_bic, rsq_lasso_cv))
rownames(rsq) <- c("ridge validación cruzada", "ridge AIC", "ridge BIC", "lasso Validación cruzada")
print(rsq)
install.packages("caret", dependencies = T)
library(caret)
#-> Se determina los parametros del entrenamiento
train_control = trainControl(method = "repeatedcv",
number = 5,
repeats = 5,
search = "random",
verboseIter = TRUE)
#-> Se entrena el modelo
elastic_net_model <- train(gdp ~ .,
data = cbind(y, X),
method = "glmnet",
preProcess = c("center", "scale"),
tuneLength = 25,
trControl = train_control)
#-> Se entrena el modelo
elastic_net_model <- train(gdp ~ .,
data = cbind(y, X),
method = "glmnet",
preProcess = c("center", "scale"),
tuneLength = 25,
trControl = train_control)
#-> Se entrena el modelo
gdp_tr = gdp(:,2)
#-> Se entrena el modelo
gdp_tr = gdp[:,2]
#-> Se entrena el modelo
gdp_tr = gdp$gdp
elastic_net_model <- train(gdp_tr ~ .,
data = cbind(y, X),
method = "glmnet",
preProcess = c("center", "scale"),
tuneLength = 25,
trControl = train_control)
#-> Se determina los parametros del entrenamiento
train_control = trainControl(method = "repeatedcv",
number = 10,
repeats = 10,
search = "random",
verboseIter = TRUE)
#-> Se entrena el modelo
gdp_tr = gdp$gdp
elastic_net_model <- train(gdp_tr ~ .,
data = cbind(y, X),
method = "glmnet",
preProcess = c("center", "scale"),
tuneLength = 25,
trControl = train_control)
View(gdp)
#-> Se entrena el modelo
gdp_tr = gdp[-date]
View(y)
View(y)
#-> Se determina los parametros del entrenamiento
train_control = trainControl(method = "repeatedcv",
number = 10,
repeats = 10,
search = "random",
verboseIter = TRUE)
#-> Se entrena el modelo
elastic_net_model <- train(gdp ~ .,
data = cbind(y, X),
method = "glmnet",
preProcess = c("center", "scale"),
tuneLength = 25,
trControl = train_control)
View(y)
View(X)
#-> Se determina los parametros del entrenamiento
train_control = trainControl(method = "repeatedcv",
number = 50,
repeats = 100,
search = "random",
verboseIter = TRUE)
#-> Se entrena el modelo
elastic_net_model <- train(gdp ~ .,
data = cbind(y, X),
method = "glmnet",
preProcess = c("center", "scale"),
tuneLength = 25,
trControl = train_control)
#-> Se determina los parametros del entrenamiento
train_control = trainControl(method = "repeatedcv",
number = 10,
repeats = 3,
search = "random",
verboseIter = TRUE, returnResamp = "all",
savePredictions = "all")
#-> Se entrena el modelo
elastic_net_model <- train(gdp ~ .,
data = cbind(y, X),
method = "glmnet",
preProcess = c("center", "scale"),
tuneLength = 25,
trControl = train_control)
elastic_net_model$results
#-> Se determina los parametros del entrenamiento
train_control = trainControl(method = "repeatedcv",
number = 1,
repeats = 3,
search = "random",
verboseIter = TRUE, returnResamp = "all",
savePredictions = "all")
#-> Se entrena el modelo
elastic_net_model <- train(gdp ~ .,
data = cbind(y, X),
method = "glmnet",
preProcess = c("center", "scale"),
tuneLength = 25,
trControl = train_control)
#-> Se determina los parametros del entrenamiento
train_control = trainControl(method = "repeatedcv",
number = 2,
repeats = 3,
search = "random",
verboseIter = TRUE, returnResamp = "all",
savePredictions = "all")
#-> Se entrena el modelo
elastic_net_model <- train(gdp ~ .,
data = cbind(y, X),
method = "glmnet",
preProcess = c("center", "scale"),
tuneLength = 25,
trControl = train_control)
elastic_net_model$results
#-> Se analiza el RMSE
y_hat_enet <- predict(elastic_net_model, X)
rsq_enet <- cor(y, y_hat_enet)^2
rsq_enet
#-> Resultado Elastic Net
elastic_net_model
#-> Se analiza el R cuadrado
#--> Prediccion del modelo
y_hat_enet <- predict(elastic_net_model, X)
y_hat_enet
#-> Grafico
plot(elastic_model, main = "Elastic Net Regression")
#-> Grafico
plot(elastic_net_model, main = "Elastic Net Regression")
#-> Grafico del pronostico
plot(y, type = "l", lty=1)
lines(y_hat_enet, type = "l", lty=1, col=4)
#-> Se determina los parametros del entrenamiento
train_control = trainControl(method = "repeatedcv",
number = 10,
repeats = 5,
search = "random",
verboseIter = TRUE, returnResamp = "all",
savePredictions = "all")
#-> Se entrena el modelo
elastic_net_model <- train(gdp ~ .,
data = cbind(y, X),
method = "glmnet",
preProcess = c("center", "scale"),
tuneLength = 25,
trControl = train_control)
install.packages("rsample", dependencies = T)
install.packages("randomForest", dependencies = T)
install.packages("ranger", dependencies = T)
install.packages("h2o", dependencies = T)
#-> Importar librerias requeridas
library(rsample)      # data splitting
library(randomForest) # basic implementation
library(ranger)       # a faster implementation of randomForest
library(caret)        # an aggregator package for performing many machine learning models
library(h2o)          # an extremely fast java-based platform
#-> Creacion de muestras de entrenamiento (70%) y evaluacion (30%).
set.seed(123)
ames_split = initial_split(AmesHousing::make_ames(), prop = .7)
install.packages("AmesHousing", dependencies = T)
library(AmesHousing)
#-> Creacion de muestras de entrenamiento (70%) y evaluacion (30%).
set.seed(123)
ames_split = initial_split(AmesHousing::make_ames(), prop = .7)
ames_train = training(ames_split)
ames_test  = testing(ames_split)
# Limpiar workspace
rm(list = ls())
#-> Importar librerias requeridas
library(rsample)      # data splitting
library(randomForest) # basic implementation
library(ranger)       # a faster implementation of randomForest
library(caret)        # an aggregator package for performing many machine learning models
library(h2o)          # an extremely fast java-based platform
library(AmesHousing)
#-> Creacion de muestras de entrenamiento (70%) y evaluacion (30%).
set.seed(123)
ames_split = initial_split(AmesHousing::make_ames(), prop = .7)
ames_train = training(ames_split)
ames_test  = testing(ames_split)
View(ames_test)
#-> Implementacion del modelo estandar
m1 <- randomForest(
formula = Sale_Price ~ .,
data    = ames_train
)
m1
80/3
81/3
79/3
View(ames_split)
View(ames_test)
#-> Implementacion del modelo estandar
m1 <- randomForest(
formula = Sale_Price ~ .,
data    = ames_train,
ntree=500,
mtry=if (!is.null(y) && !is.factor(y))
max(floor(ncol(x)/3), 1) else floor(sqrt(ncol(x)))
)
#-> Implementacion del modelo estandar
m1 <- randomForest(
formula = Sale_Price ~ .,
data    = ames_train,
ntree=500,
mtry=if (!is.null(Sale_Price) && !is.factor(Sale_Price))
max(floor(ncol(x)/3), 1) else floor(sqrt(ncol(x)))
)
#-> Implementacion del modelo estandar
m1 <- randomForest(
formula = Sale_Price ~ .,
data    = ames_train,
ntree=500,
)
#-> Implementacion del modelo estandar
m1 <- randomForest(
formula = Sale_Price ~ .,
data    = ames_train,
ntree=500
)
#-> Implementacion del modelo estandar
m1 <- randomForest(
formula = Sale_Price ~ .,
data    = ames_train,
ntree=500,
mtry = NULL
)
#-> Implementacion del modelo estandar
m1 <- randomForest(
formula = Sale_Price ~ .,
data    = ames_train,
ntree=500,
mtry = p/3
)
ncol(ames_test)
#-> Implementacion del modelo estandar
m1 <- randomForest(
formula = Sale_Price ~ .,
data    = ames_train,
ntree=500,
mtry = ncol(ames_test)/3
)
m1
#-> Implementacion del modelo estandar
m1 <- randomForest(
formula = Sale_Price ~ .,
data    = ames_train,
ntree=500,
mtry = (ncol(ames_test)-1)/3
)
m1
#-> Implementacion del modelo estandar
m1 <- randomForest(
formula = Sale_Price ~ .,
data    = ames_train,
ntree=500,
mtry = (ncol(ames_test)-2)/3
)
m1
#-> Implementacion del modelo estandar
m1 <- randomForest(
formula = Sale_Price ~ .,
data    = ames_train
)
summary(m1)
m1$terms
summary(m1)
m1$call
m1$type
m1$mtry
m1$forest
#-> Implementacion del modelo estandar
m1 <- randomForest(
formula = Sale_Price ~ .,
data    = ames_train,
ntree   = 500
)
m1
#-> Implementacion del modelo estandar
m1 <- randomForest(
formula = Sale_Price ~ .,
data    = ames_train
)
m1
m1$terms
plot(m1)
#-> Se determina el numero optimo de arboles a entrar en la regresion
which.min(m1$mse)
#-> RMSE del random forest con el numero optimo de arboles
sqrt(which.min(m1$mse))
#-> Creacion de muestras de entrenamiento (70%) y evaluacion (30%).
set.seed(123)
ames_split = initial_split(AmesHousing::make_ames(), prop = .7)
ames_train = training(ames_split)
ames_test  = testing(ames_split)
#-> Implementacion del modelo estandar
m1 <- randomForest(
formula = Sale_Price ~ .,
data    = ames_train
)
m1
#-> Graficar el comportamiento del error a medida que se incorporan arboles
#-> Se usa OOB error como el metodo de calculo del error
plot(m1)
#-> Se determina el numero optimo de arboles a entrar en la regresion
which.min(m1$mse)
#-> RMSE del random forest con el numero optimo de arboles
sqrt(which.min(m1$mse))
#-> Creacion de muestras de entrenamiento (70%) y evaluacion (30%).
set.seed(123)
ames_split = initial_split(AmesHousing::make_ames(), prop = .7)
ames_train = training(ames_split)
ames_test  = testing(ames_split)
#-> Implementacion del modelo estandar
m1 <- randomForest(
formula = Sale_Price ~ .,
data    = ames_train
)
m1
#-> Graficar el comportamiento del error a medida que se incorporan arboles
#-> Se usa OOB error como el metodo de calculo del error
plot(m1)
#-> Se determina el numero optimo de arboles a entrar en la regresion
which.min(m1$mse)
#-> RMSE del random forest con el numero optimo de arboles
sqrt(m1$mse[which.min(m1$mse)])
#-> Implementacion del modelo estandar
m1 <- randomForest(
formula = Sale_Price ~ .,
data    = ames_train,
ntree   = 700
)
m1
#-> Graficar el comportamiento del error a medida que se incorporan arboles
#-> Se usa OOB error como el metodo de calculo del error
plot(m1)
#-> Se determina el numero optimo de arboles a entrar en la regresion
which.min(m1$mse)
#-> RMSE del random forest con el numero optimo de arboles
sqrt(m1$mse[which.min(m1$mse)])
#-> Si se desea utilizar validacion cruzada para hallar el mejor modelo, se debe hacer:
#-> Se crea las muestras de estimacion y de evaluacion
set.seed(123)
valid_split = initial_split(ames_train, .7)
#-> Muestra de entrenamiento
ames_train_v2 = analysis(valid_split)
#-> Muestra de evaluacion
ames_valid = assessment(valid_split)
x_test = ames_valid[setdiff(names(ames_valid), "Sale_Price")]
y_test = ames_valid$Sale_Price
#-> Se estima el modelo con las muestras propuestas arriba
rf_oob_comp = randomForest(
formula = Sale_Price ~ .,
data    = ames_train_v2,
ntree   = 800,
xtest   = x_test,
ytest   = y_test
)
#-> Extraer errores OOB y de pronostico con la muestra de evaluacion
oob = sqrt(rf_oob_comp$mse)
validation = sqrt(rf_oob_comp$test$mse)
#-> Comparar las tasas de error
tibble::tibble(
`Out of Bag Error` = oob,
`Test error` = validation,
ntrees = 1:rf_oob_comp$ntree
) %>%
gather(Metric, RMSE, -ntrees) %>%
ggplot(aes(ntrees, RMSE, color = Metric)) +
geom_line() +
scale_y_continuous(labels = scales::dollar) +
xlab("Número de árboles")
install.packages("tidyr", dependencies = T)
library(tidyr)
#-> Comparar las tasas de error
tibble::tibble(
`Out of Bag Error` = oob,
`Test error` = validation,
ntrees = 1:rf_oob_comp$ntree
) %>%
gather(Metric, RMSE, -ntrees) %>%
ggplot(aes(ntrees, RMSE, color = Metric)) +
geom_line() +
scale_y_continuous(labels = scales::dollar) +
xlab("Número de árboles")
library(ggplot2)
#-> Comparar las tasas de error
tibble::tibble(
`Out of Bag Error` = oob,
`Test error` = validation,
ntrees = 1:rf_oob_comp$ntree
) %>%
gather(Metric, RMSE, -ntrees) %>%
ggplot(aes(ntrees, RMSE, color = Metric)) +
geom_line() +
scale_y_continuous(labels = scales::dollar) +
xlab("Número de árboles")
#-> Comparar las tasas de error
tibble::tibble(
`Out of Bag Error` = oob,
`Error validación` = validation,
ntrees = 1:rf_oob_comp$ntree
) %>%
gather(Metric, RMSE, -ntrees) %>%
ggplot(aes(ntrees, RMSE, color = Metric)) +
geom_line() +
scale_y_continuous(labels = scales::dollar) +
xlab("Número de árboles")
#--> Se extrae el nombre de los features
features <- setdiff(names(ames_train), "Sale_Price")
set.seed(123)
#--> Se usa tuneRF para proponer que en cada bifurcacion se pruebe con 5 variables,
#--> incrementando su cantidad por un factor de 1,5 hasta que el error deje de
#--> mejorar en 0,01 (1%).
m2 <- tuneRF(
x          = ames_train[features],
y          = ames_train$Sale_Price,
ntreeTry   = 500,
mtryStart  = 5,
stepFactor = 1.5,
improve    = 0.01,
trace      = FALSE      # to not show real-time progress
)
install.packages("iRF", dependencies = T)
library(devtools)
install.packages("devtools", dependencies = T)
library(devtools)
devtools::install_github("sumbose/iRF")
library("iPF")
library("iRF")
library(devtools)
devtools::install_github("sumbose/iRF")
library(devtools)
devtools::install_github("karlkumbier/iRF2.0")
